package cn.wcg.crawler;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.Random;

import static cn.wcg.crawler.HBaseApi.*;


public class HttpClientPool {
    /**
     * Program entry point: runs the wage/age re-crawl performed by {@code alter_data()}.
     *
     * @param args command-line arguments; not used
     * @throws IOException propagated unchanged from {@code alter_data()}
     */
    public static void main(String[] args) throws IOException {
        HttpClientPool.alter_data();
    }

    /**
     * Re-crawls list pages 1 to 271 and rewrites the wage and age columns of every job row.
     *
     * <p>For each element matching {@code .J_jobsList} on a list page, the job's detail page is
     * fetched, the text of the first {@code .wage} element is converted to a min/max amount and
     * the last child of the sixth {@code .itemli} element is taken as the age requirement. The
     * three values are written to family {@code work} of table {@code data}.
     *
     * <p>The row key is a counter that starts at 0 and grows by one per job, in crawl order.
     * NOTE(review): this only lines up with rows stored earlier if the site still lists the jobs
     * in the same order - confirm before running against real data.
     *
     * <p>Stored wage values: parsed amounts, {@code -1} for "面议" (negotiable), {@code -2} when
     * the page yielded no wage text, and {@code 0} when the text could not be parsed.
     *
     * <p>A detail page that could not be fetched no longer aborts the crawl: the job is stored
     * with the {@code -2} wage sentinels and an empty age string.
     *
     * @throws IOException if one of the called helpers reports an I/O failure
     */
    public static void alter_data() throws IOException {
        init();
        final String table_name = "data";
        final String work_family = "work";
        // Connection pool shared by all requests of this crawl.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(1000);            // upper bound on pooled connections
        cm.setDefaultMaxPerRoute(400);   // upper bound per target host
        int k = 0;
        Random r = new Random();
        try {
            for (int i = 1; i <= 271; i++) {
                System.out.println("爬取第" + i + "页");
                String html = doGet(cm, i);
                // A missing page (null) is treated like an empty one: it simply yields no jobs.
                Document doc = Jsoup.parse(html == null ? "" : html);
                Elements elements = doc.select(".J_jobsList");
                System.out.println("职位条数：" + elements.size());
                for (Element element : elements) {
                    int jid = Integer.parseInt(element.attr("data-jid"));
                    String fetched = doGetSecond(cm, jid);
                    String htmlSecond = fetched == null ? "" : fetched;
                    Document ifo = Jsoup.parse(htmlSecond);

                    String url = "http://ynsrc.com/jobs/"+jid+".html";
                    System.out.println("爬取" + url);
                    if (htmlSecond.equals("")) {
                        System.out.println("爬了个寂寞，烦");
                    } else {
                        System.out.println("爬到了页面");
                    }

                    // Wage text; empty when the page has no .wage element (e.g. empty page).
                    Elements wages = ifo.select(".wage");
                    String wage = wages.isEmpty() ? "" : wages.get(0).text();

                    // Age requirement: last child of the sixth .itemli entry, when present.
                    Elements itemli = ifo.select(".itemli");
                    String age_str = "";
                    if (itemli.size() > 5 && itemli.get(5).lastChild() != null) {
                        age_str = itemli.get(5).lastChild().toString();
                    }

                    int[] wageRange = parseKiloWageRange(wage);
                    Integer min_wage = wageRange[0];
                    Integer max_wage = wageRange[1];

                    System.out.println(k+wage);
                    System.out.println(min_wage+" "+max_wage);
                    System.out.println(age_str);
                    put_int_data(table_name, String.valueOf(k), work_family, "min_wage", min_wage);
                    put_int_data(table_name, String.valueOf(k), work_family, "max_wage", max_wage);
                    put_string_data(table_name, String.valueOf(k), work_family, "age", age_str);

                    k++;
                    // Pause 1-2 seconds between detail requests. The previous r.nextInt(1) + 1
                    // always evaluated to 1, so the delay was never random.
                    try {
                        Thread.sleep(1000L + r.nextInt(1000));
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the caller and stop crawling.
                        Thread.currentThread().interrupt();
                        System.out.println("crawl interrupted, stopping at row " + k);
                        return;
                    }
                }
            }
        } finally {
            // Release the pool even when a page or a store operation fails.
            cm.close();
        }
    }

    /**
     * Converts wage text such as {@code "3K-5K"}, {@code "3.5K"} or {@code "面议"} to a
     * two-element array {@code {min, max}}.
     *
     * @param wage wage text taken from the detail page; must not be null
     * @return parsed amounts; {@code {-2, -2}} for empty text, {@code {-1, -1}} for "面议",
     *         {@code {0, 0}} when the text cannot be parsed
     */
    private static int[] parseKiloWageRange(String wage) {
        if (wage.equals("")) {
            return new int[] {-2, -2};
        }
        if (wage.equals("面议")) {
            return new int[] {-1, -1};
        }
        try {
            int dash = wage.indexOf('-');
            if (dash >= 0) {
                return new int[] {
                    kiloToAmount(wage.substring(0, dash)),
                    kiloToAmount(wage.substring(dash + 1))
                };
            }
            int single = kiloToAmount(wage);
            return new int[] {single, single};
        } catch (NumberFormatException | ArithmeticException e) {
            System.out.println("unparseable wage text: " + wage);
            return new int[] {0, 0};
        }
    }

    /**
     * Parses one bound such as {@code "3.5K"} (the {@code K} is optional) and multiplies it by 1000.
     *
     * @throws NumberFormatException if the part before {@code K} is not a number
     * @throws ArithmeticException if the result does not fit in an {@code int}
     */
    private static int kiloToAmount(String bound) {
        int unit = bound.indexOf('K');
        String digits = (unit >= 0 ? bound.substring(0, unit) : bound).trim();
        return Math.toIntExact(Math.round(Double.parseDouble(digits) * 1000));
    }

    /**
     * Crawls job list pages and stores one row per job in table {@code data}.
     *
     * <p>For every element matching {@code .J_jobsList} the detail page is fetched and these
     * columns are written: family {@code work} gets post, min_wage, max_wage, work, nums,
     * education, min_experience, max_experience, age and address; family {@code company} gets
     * company, nature, industry and url.
     *
     * <p>Sentinels used by the cleaning step: {@code -2} means the source text was empty,
     * {@code -1} means "negotiable / unspecified / no limit", and wage text that cannot be
     * parsed is stored as {@code 0}. The headcount, experience and age fields are parsed exactly
     * as before and still throw on malformed text.
     *
     * <p>NOTE(review): the page loop currently covers only page 271 and row keys start at 5399,
     * presumably to resume an earlier run - confirm before reusing.
     *
     * @throws IOException if one of the called helpers reports an I/O failure
     */
    public static void spider() throws IOException {
        init();
        final String table_name = "data";
        final String[] family = {"work", "company"};
        // Connection pool shared by all requests of this crawl.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(1000);            // upper bound on pooled connections
        cm.setDefaultMaxPerRoute(400);   // upper bound per target host
        int k = 5399;
        Random r = new Random();
        try {
            for (int i = 271; i <= 271; i++) {
                System.out.println("爬取第" + i + "页");
                String html = doGet(cm, i);
                // A missing page (null) is treated like an empty one: it simply yields no jobs.
                Document doc = Jsoup.parse(html == null ? "" : html);
                Elements elements = doc.select(".J_jobsList");
                System.out.println("职位条数：" + elements.size());
                for (Element element : elements) {
                    int jid = Integer.parseInt(element.attr("data-jid"));
                    String fetched = doGetSecond(cm, jid);
                    String htmlSecond = fetched == null ? "" : fetched;
                    Document ifo = Jsoup.parse(htmlSecond);

                    String url = "http://ynsrc.com/jobs/"+jid+".html";
                    System.out.println("爬取" + url);
                    if (htmlSecond.equals("")) {
                        System.out.println("爬了个寂寞，烦");
                    } else {
                        System.out.println("爬到了页面");
                    }

                    // ---- job fields ----
                    String post = ifo.select(".j-n-txt").text();   // job title
                    String wage = ifo.select(".wage").text();      // wage text
                    Elements itemli = ifo.select(".itemli");
                    String work = lastChildMarkup(itemli, 0);       // employment type
                    String numbers = lastChildMarkup(itemli, 1);    // headcount
                    String education = lastChildMarkup(itemli, 3);  // education
                    String experience = lastChildMarkup(itemli, 4); // experience
                    String age_str = lastChildMarkup(itemli, 5);    // age requirement
                    Elements addresses = ifo.select(".add");
                    String address = addresses.isEmpty() ? "" : addresses.last().text();

                    // ---- company fields ----
                    Element companyName = ifo.select(".comname").first();
                    String company = companyName == null ? "" : companyName.text();
                    Elements info = ifo.select(".info");
                    String nature = lastChildMarkup(info, 0);       // company type
                    String industry = lastChildMarkup(info, 1);     // industry

                    // ---- cleaning ----
                    int[] wageRange = parseChineseWageRange(wage);
                    Integer min_wage = wageRange[0];
                    Integer max_wage = wageRange[1];

                    Integer nums = 0;
                    if (!numbers.equals("")) {
                        if (numbers.equals("若干人")) {
                            nums = -1;
                        } else {
                            nums += Integer.parseInt(numbers.substring(0, numbers.indexOf('人')));
                        }
                    } else {
                        nums = -2;
                    }

                    Integer min_experience = 0;
                    Integer max_experience = 0;
                    if (!experience.equals("")) {
                        if (experience.indexOf('-') != -1) {
                            min_experience = Integer.parseInt(experience.substring(0, experience.indexOf('-')));
                            max_experience = Integer.parseInt(experience.substring(experience.indexOf('-') + 1, experience.indexOf('年')));
                        } else if (experience.equals("无经验") || experience.equals("不限")) {
                            min_experience = -1;
                            max_experience = -1;
                        } else if (experience.indexOf('年') != -1) {
                            // Only a lower bound is given; 100 stands in for "no upper bound".
                            min_experience = Integer.parseInt(experience.substring(0, experience.indexOf('年')));
                            max_experience = 100;
                        }
                    } else {
                        min_experience = -2;
                        max_experience = -2;
                    }

                    Integer age = 0;
                    if (!age_str.equals("")) {
                        if (age_str.equals("不限")) {
                            age = -1;
                        } else if (age_str.indexOf('岁') != -1) {
                            age = Integer.parseInt(age_str.substring(0, age_str.indexOf("岁")));
                        }
                    } else {
                        age = -2;
                    }

                    if (address.equals("")) {
                        address = "没爬下来";
                    }

                    // ---- storage: job columns ----
                    String rowKey = String.valueOf(k);
                    put_string_data(table_name, rowKey, family[0], "post", post);
                    put_int_data(table_name, rowKey, family[0], "min_wage", min_wage);
                    put_int_data(table_name, rowKey, family[0], "max_wage", max_wage);
                    put_string_data(table_name, rowKey, family[0], "work", work);
                    put_int_data(table_name, rowKey, family[0], "nums", nums);
                    put_string_data(table_name, rowKey, family[0], "education", education);
                    put_int_data(table_name, rowKey, family[0], "min_experience", min_experience);
                    put_int_data(table_name, rowKey, family[0], "max_experience", max_experience);
                    put_int_data(table_name, rowKey, family[0], "age", age);
                    put_string_data(table_name, rowKey, family[0], "address", address);
                    // ---- storage: company columns ----
                    put_string_data(table_name, rowKey, family[1], "company", company);
                    put_string_data(table_name, rowKey, family[1], "nature", nature);
                    put_string_data(table_name, rowKey, family[1], "industry", industry);
                    put_string_data(table_name, rowKey, family[1], "url", url);
                    System.out.println(rowKey + address);

                    k++;
                    // Pause 1-2 seconds between detail requests. The previous r.nextInt(1) + 1
                    // always evaluated to 1, so the delay was never random.
                    try {
                        Thread.sleep(1000L + r.nextInt(1000));
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the caller and stop crawling.
                        Thread.currentThread().interrupt();
                        System.out.println("crawl interrupted, stopping at row " + k);
                        return;
                    }
                }
            }
        } finally {
            // Release the pool even when a page or a store operation fails.
            cm.close();
        }
    }

    /**
     * Returns the markup of the last child node of {@code items.get(index)}, or an empty string
     * when the list is too short or the element has no children.
     */
    private static String lastChildMarkup(Elements items, int index) {
        if (index >= items.size()) {
            return "";
        }
        Element item = items.get(index);
        return item.lastChild() == null ? "" : item.lastChild().toString();
    }

    /**
     * Converts wage text written with 千 (x1000) or 万 (x10000) units, e.g. {@code "3千-5千/月"},
     * {@code "3千5-4千5/月"} or {@code "1万-2万/月"}, to a two-element array {@code {min, max}}.
     *
     * <p>The unit is read from the second character. Both units are handled the same way: a
     * single amount gives {@code max == min}, and a digit right after the unit counts as one
     * tenth of the unit. Previously the 万 branch reset {@code max} to {@code min} for ranges
     * without a fractional digit and failed on text without a range.
     *
     * @param wage wage text taken from the detail page; must not be null
     * @return parsed amounts; {@code {-2, -2}} for empty text, {@code {-1, -1}} for "面议",
     *         {@code {0, 0}} when the format is not recognised
     */
    private static int[] parseChineseWageRange(String wage) {
        if (wage.equals("")) {
            return new int[] {-2, -2};
        }
        if (wage.equals("面议")) {
            return new int[] {-1, -1};
        }
        if (wage.length() < 2) {
            return new int[] {0, 0};
        }
        char unit = wage.charAt(1);
        int scale;
        if (unit == '千') {
            scale = 1000;
        } else if (unit == '万') {
            scale = 10000;
        } else {
            return new int[] {0, 0};
        }
        try {
            int min = chineseAmountAt(wage, 0, scale);
            int max = min;
            int dash = wage.indexOf('-');
            if (dash != -1) {
                max = chineseAmountAt(wage, dash + 1, scale);
                // The upper bound is always scaled with the lower bound's unit; a 千 range whose
                // upper bound comes out smaller (e.g. "8千-1万") is corrected by a factor of 10.
                if (unit == '千' && max < min) {
                    max *= 10;
                }
            }
            return new int[] {min, max};
        } catch (NumberFormatException e) {
            System.out.println("unparseable wage text: " + wage);
            return new int[] {0, 0};
        }
    }

    /**
     * Reads {@code <digit><unit>[<digit>]} starting at {@code pos}: the first digit is multiplied
     * by {@code scale}, and an optional digit after the unit character by {@code scale / 10}.
     *
     * @throws NumberFormatException if a digit is missing or not numeric
     */
    private static int chineseAmountAt(String wage, int pos, int scale) {
        if (pos >= wage.length()) {
            throw new NumberFormatException("missing amount at index " + pos);
        }
        int amount = Integer.parseInt(String.valueOf(wage.charAt(pos))) * scale;
        int fractionPos = pos + 2;
        if (fractionPos < wage.length()) {
            char next = wage.charAt(fractionPos);
            if (next != '-' && next != '/') {
                amount += Integer.parseInt(String.valueOf(next)) * (scale / 10);
            }
        }
        return amount;
    }

    /**
     * Downloads one job list page, {@code http://ynsrc.com/jobs/jobs_list/page/<i>.html}.
     *
     * <p>A client is built on top of the shared connection manager for each call; it is not
     * closed here because the connections belong to {@code cm}.
     *
     * @param cm connection manager that supplies the HTTP connection
     * @param i  page number placed in the URL
     * @return the response body decoded as UTF-8; an empty string when the status is not 200 or
     *         the response has no entity. Never {@code null}, so callers can pass the result
     *         straight to an HTML parser.
     * @throws UncheckedIOException (a {@code RuntimeException}) if the request or the response
     *         handling fails with an I/O error
     */
    public static String doGet(PoolingHttpClientConnectionManager cm, int i) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        String pageUrl = "http://ynsrc.com/jobs/jobs_list/page/"+i+".html";
        HttpGet httpGet = new HttpGet(pageUrl);

        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000)           // ms allowed for establishing the connection
                .setConnectionRequestTimeout(5000)  // ms allowed for leasing a connection from the pool
                .setSocketTimeout(100 * 1000)       // ms allowed while waiting for response data
                .build();

        // Browser-like request headers.
        // NOTE(review): the Cookie value is a hard-coded session (PHPSESSID etc.) with repeated
        // keys; it will expire and should not live in source control - consider configuration.
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Cookie", "SECKEY_ABVK=JPB6wSzDB+VMkXCy2oJeOZdnd1OeNdvZqSEJuRtq4NY=; BMAP_SECKEY=ngd4ABOwQ7YIIhDizWtnlMjGc6u1n_uK3VCM71e-_wC-xEKB6xv8zbogH5sCFCW_4cd42CGREN87qTV8EKWToYisMar60K4hR-Nfvwg5Vm6oT8pM86PVq1eUDTF5amGo9jYlKXab0ZUuG2wT03Emh35sfsdacSaKNVboAtCVSakznSQKQyGrf0tLRdE2i3dN; SECKEY_ABVK=JPB6wSzDB+VMkXCy2oJeOVqpnDu94QMYGKFqilJOjHc=; BMAP_SECKEY=ngd4ABOwQ7YIIhDizWtnlCN_inL1tbIRPkQn9HyMJHPeprnm_DLTZdnaW4PzmEwwdPkN2SPLNgSnkR69IwbKtM8HvV0ClsJbLNBJWMN3pw4B0nxYKhTdDPB-C1700bZq9m8BGkOUENjzC6EaCr2MHwUpkryAWa3b-bulPX5YDEwDd-9J48z6Vijy8zl89Mdn; PHPSESSID=e5ciammvbvgu4b8ra1ettb0il5; think_template=default; think_language=zh-CN");
        httpGet.setHeader("Host", "ynsrc.com");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54");
        httpGet.setConfig(config);

        // The response is closed automatically; only the response is closed, not the client.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                System.out.println("unexpected HTTP status " + status + " for " + pageUrl);
                return "";
            }
            if (response.getEntity() == null) {
                System.out.println("entity为空");
                return "";
            }
            return EntityUtils.toString(response.getEntity(), "utf8");
        } catch (IOException e) {
            throw new UncheckedIOException("failed to fetch " + pageUrl, e);
        } finally {
            httpGet.releaseConnection();
        }
    }

    /**
     * Builds a request configuration with short timeouts: 1000 ms to connect, 500 ms to obtain a
     * connection from the connection manager and 10 s of socket timeout.
     *
     * <p>No code in this class calls this method.
     *
     * @return the freshly built configuration
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10 * 1000)
                .build();
    }

    /**
     * Downloads the detail page of one job, {@code http://ynsrc.com/jobs/<i>.html}.
     *
     * <p>The job id used to be appended to the URL a second time, so the request went to
     * {@code .../jobs/<i>.html<i>}; the page is now requested under the plain URL.
     *
     * <p>A client is built on top of the shared connection manager for each call; it is not
     * closed here because the connections belong to {@code cm}.
     *
     * @param cm connection manager that supplies the HTTP connection
     * @param i  job id placed in the URL
     * @return the response body decoded as UTF-8; an empty string when the status is not 200 or
     *         the response has no entity. Never {@code null}, so callers can pass the result
     *         straight to an HTML parser.
     * @throws UncheckedIOException (a {@code RuntimeException}) if the request or the response
     *         handling fails with an I/O error
     */
    public static String doGetSecond(PoolingHttpClientConnectionManager cm, int i) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        String url = "http://ynsrc.com/jobs/"+i+".html";
        HttpGet httpGet = new HttpGet(url);

        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000)           // ms allowed for establishing the connection
                .setConnectionRequestTimeout(5000)  // ms allowed for leasing a connection from the pool
                .setSocketTimeout(100 * 1000)       // ms allowed while waiting for response data
                .build();

        // Browser-like request headers.
        // NOTE(review): the Cookie value is a hard-coded session (PHPSESSID etc.) with repeated
        // keys; it will expire and should not live in source control - consider configuration.
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        httpGet.setHeader("Cache-Control", "max-age=0");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Cookie", "SECKEY_ABVK=JPB6wSzDB+VMkXCy2oJeOZdnd1OeNdvZqSEJuRtq4NY=; BMAP_SECKEY=ngd4ABOwQ7YIIhDizWtnlMjGc6u1n_uK3VCM71e-_wC-xEKB6xv8zbogH5sCFCW_4cd42CGREN87qTV8EKWToYisMar60K4hR-Nfvwg5Vm6oT8pM86PVq1eUDTF5amGo9jYlKXab0ZUuG2wT03Emh35sfsdacSaKNVboAtCVSakznSQKQyGrf0tLRdE2i3dN; SECKEY_ABVK=JPB6wSzDB+VMkXCy2oJeOVqpnDu94QMYGKFqilJOjHc=; BMAP_SECKEY=ngd4ABOwQ7YIIhDizWtnlCN_inL1tbIRPkQn9HyMJHPeprnm_DLTZdnaW4PzmEwwdPkN2SPLNgSnkR69IwbKtM8HvV0ClsJbLNBJWMN3pw4B0nxYKhTdDPB-C1700bZq9m8BGkOUENjzC6EaCr2MHwUpkryAWa3b-bulPX5YDEwDd-9J48z6Vijy8zl89Mdn; PHPSESSID=e5ciammvbvgu4b8ra1ettb0il5; think_template=default; think_language=zh-CN");
        httpGet.setHeader("Host", "ynsrc.com");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54");
        httpGet.setConfig(config);

        // The response is closed automatically; only the response is closed, not the client.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                System.out.println("unexpected HTTP status " + status + " for " + url);
                return "";
            }
            if (response.getEntity() == null) {
                return "";
            }
            return EntityUtils.toString(response.getEntity(), "utf8");
        } catch (IOException e) {
            throw new UncheckedIOException("failed to fetch " + url, e);
        } finally {
            httpGet.releaseConnection();
        }
    }
}
